////////////////////////////////////////////////////////////////////////////////
*Cleaning SOEP data (Germany)
////////////////////////////////////////////////////////////////////////////////

cd "$germanydata"

* Household location lookup: household id, survey year and sample region.
* Stored in the tempfile `regions', which is merged m:1 on hid year further down.
use "raw/hbrutto.dta", clear
keep hid syear sampreg
rename syear year
rename sampreg region
tempfile regions
save `regions'

* Unemployment series: read the CSV with numerics as doubles, divide the rate
* by 100 (presumably percent to share -- confirm against the raw file) and save it
import delimited "raw/unemployment.csv", asdouble clear
replace unemploymentrate = unemploymentrate/100
rename unemploymentrate urate
save "clean/unemployment.dta", replace

* Person weights: one row per pid in the raw file, one weight variable per wave
use "raw/phrf.dta", clear
keep pid *phrf

* Single-letter wave prefixes a..z correspond to 1984..2009
forvalues k = 1/26 {
	local wave : word `k' of `c(alpha)'
	local yr = 1983 + `k'
	rename `wave'phrf phrf`yr'
}

* Two-letter wave prefixes ba..bi correspond to 2010..2018
forvalues k = 1/9 {
	local wave : word `k' of `c(alpha)'
	local yr = 2009 + `k'
	rename b`wave'phrf phrf`yr'
}

* Long format: one row per pid and year, weight held in phrf
reshape long phrf, i(pid) j(year)
tempfile weights
save `weights'

* Generated person data (pgen): marital status, education, job tenure and an
* employment indicator
use "raw/pgen.dta", clear

* Give the raw variables working names in one step
rename (pgfamstd pgisced97 syear pgerwzeit) (marital educ year jobtenure)

* Rows with a negative marital-status code are removed
drop if marital < 0

* employed2: 1 for pgemplst codes 1-4, 0 for codes 5-6, missing otherwise
generate employed2 = cond(inrange(pgemplst, 1, 4), 1, cond(inrange(pgemplst, 5, 6), 0, .))

keep pid educ marital jobtenure employed2 year

* Helper file read back by the employment step; it carries employed2
save "temp/employed_help.dta", replace

* The education tempfile itself does not carry employed2
drop employed2
tempfile educ
save `educ'

*Employment dataset
* Person-year job characteristics from raw/pl.dta, combined with the pgen-based
* indicator employed2 that was written to temp/employed_help.dta.
use "raw/pl.dta", clear
	rename syear year
	merge 1:1 pid year using "temp/employed_help.dta", keepusing(employed2)
* Keep every pl row (matched or not); discard rows found only in the helper file
	drop if _merge==2
	drop _merge

* The helper file is no longer needed; capture ignores a failed erase
cap erase "temp/employed_help.dta"

*Employment variable
* Stata ranks missing values above every number, so "plb0037_h>0" on its own
* would count a missing plb0037_h as employed. Require a nonmissing positive code.
	gen employed=0
	replace employed=1 if plb0037_h>0&!missing(plb0037_h)

*Job change variable
* 1 for plb0031_h codes 1 or 3, 0 for code 2, missing otherwise
	gen job_change=1 if plb0031_h==1|plb0031_h==3
	replace job_change=0 if plb0031_h==2
	
*Temporary employment variable
* 1 for plb0037_h code 2, 0 for code 1, missing otherwise
	gen temp=.
	replace temp=1 if plb0037_h==2
	replace temp=0 if plb0037_h==1
	
*Self-employment variable
	gen semp=1 if plb0037_h==3|plb0037_h==4
	replace semp=1 if inrange(plb0057_h,1,6)
	replace semp=0 if employed==1&semp==.
*Self-employed mostly answer "doesn't apply" from 2013 on--we can adjust for this
	replace semp=1 if year>=2013&plb0037_h==-2&employed2==1

*Part-time worker variable
* Same missing-value guard as above: a missing plb0022_h must not be coded 0
	gen parttime=1 if plb0022_h==2
	replace parttime=0 if plb0022_h>0&plb0022_h!=2&!missing(plb0022_h)
	
*Marginally employed variable--check up on this
* 1 for plb0022_h code 4, 0 for everything else (including missing)
	gen marginal=1 if plb0022_h==4
	recode marginal .=0

*ISCO codes
* Work on string copies so that the leading digit can be extracted
	gen isco=""
	tostring p_isco88, gen(isco88)
	tostring p_isco08, gen(isco08)
	replace isco88=substr(isco88, 1, 1) if p_isco88>0
	replace isco08=substr(isco08, 1, 1) if p_isco08>0
* Code 100 is assigned group 0 instead of its leading digit 1
	replace isco88="0" if p_isco88==100
	replace isco08="0" if p_isco08==100
* isco88 is used for every year except 2018, which takes isco08.
* (A conditional "replace isco=isco88 if p_isco88>0" used to precede this line;
* it was overwritten immediately by the unconditional replace and has been dropped.)
	replace isco=isco88
	replace isco=isco08 if year==2018
	destring isco, replace
* Negative source codes were not reduced to one digit above; set them to missing
	replace isco=. if isco<0

	rename plh0042 jobsecurity
	rename plh0173 jobsat
	
	keep pid year temp semp isco jobsecurity jobsat employed job_change parttime ///
marginal employed2 plb0037_h plb0433*
	tempfile economics
save `economics'

* Demographics from the person tracking file: sex, age and country-of-birth flag
use "raw/ppathl.dta", clear

* Rows with a negative survey year or person id are discarded
drop if syear < 0 | pid < 0

* male: 1 for sex code 1, 0 for sex code 2, missing otherwise
generate male = cond(sex == 1, 1, cond(sex == 2, 0, .))

* Age at interview from interview year and birth year/month, rounded to whole
* years; only computed when interview year and birth year are positive.
* NOTE(review): gebmonat is not checked for negative codes -- confirm whether
* such codes occur and whether they should enter the calculation.
generate age = piyear - (gebjahr + gebmonat/12) if piyear > 0 & gebjahr > 0
replace age = round(age)
generate age_squared = (age/10)^2

* immigrant: 1 for germborn code 2, 0 for germborn code 1, missing otherwise
generate immigrant = cond(germborn == 2, 1, cond(germborn == 1, 0, .))

rename syear year
keep pid hid year male age age_squared immigrant

tempfile demographics
save `demographics'

*Merge everything
* Start from the employment data and keep only person-years that are present in
* every other source (demographics, education, weights, household region).
* "clear" makes the load independent of whatever is currently in memory.
use `economics', clear

merge 1:1 pid year using `demographics'
	keep if _merge==3
	drop _merge

merge 1:1 pid year using `educ'
	keep if _merge==3
	drop _merge
	
merge 1:1 pid year using `weights'
	keep if _merge==3
	drop _merge

merge m:1 hid year using `regions'
	keep if _merge==3
	drop _merge
* Every observation before 1992 is assigned region 1; codes -8 and -2 become missing
	replace region=1 if year<1992
	recode region -8=. -2=.
	
* insecure: 1 when jobsecurity==1, 0 for larger nonmissing values, missing otherwise
	gen insecure=1 if jobsecurity==1
	replace insecure=0 if jobsecurity>1&!missing(jobsecurity)
	rename phrf xw
* Employed respondents with no temp/semp information are treated as neither
	replace temp=0 if employed==1&temp==.
	replace semp=0 if employed==1&semp==.
	replace employed=1 if employed==0&(temp==1|semp==1)
	drop if educ<0
	rename educ education
	
* Stata ranks missing above every number, so "education>=5" alone would set
* uni=1 for a missing education value; leave uni missing in that case.
	gen uni=1 if education>=5&!missing(education)
	replace uni=0 if education<=4
	
	sort pid year
	xtset pid year
	
	drop if isco==0
	
* In early years, only get asked about job characteristics if they have changed
* Going year by year lets a value carried into year t-1 be picked up again in year t
	forval yr=1986/2018 {
		replace temp=L.temp if year==`yr'&job_change==0&temp==.
		replace semp=L.semp if year==`yr'&job_change==0&semp==.
		replace parttime=L.parttime if year==`yr'&job_change==0&parttime==.
		replace isco=L.isco if year==`yr'&job_change==0&isco==.
	}
* The final employment indicator is the pgen-based one; missing counts as not employed
	replace employed=employed2
	recode employed .=0
	drop employed2
	
* ddr flags region code 2; it stays missing when region is missing instead of
* being silently set to 0
	gen ddr=(region==2) if !missing(region)
	
	label var pid "Person ID"
	label var year "Year"
	label var jobsecurity "Worried about job security?"
	label var employed "Employed"
	label var temp "Temporary worker"
	label var semp "Self-employed without employees"
	label var isco "ISCO-08, single-digit"
	label var male "Male"
	label var age "Age"
	label var age_squared "(Age/10) squared"
	label var education "Highest level of education attained"
	label var xw "Cross-sectional person weight"
	label var insecure "Insecure in job?"
	label var immigrant "Born outside Germany?"
	label var uni "University education"
	label var jobtenure "Job tenure"

* insample: all listed variables nonmissing (missing() with several arguments is
* true if any one is missing), employed, non-negative tenure and age at most 65
	gen insample=!missing(year, temp, parttime, semp, immigrant, marital, isco, ///
male, age, age_squared, uni, jobtenure, marginal, region, xw, insecure)&employed==1&jobtenure>=0&age<=65

save "clean/soep_clean.dta", replace

////////////////////////////////////////////////////////////////////////////////

*Labor market flows data (header names OECD, FRED, and BfA; the code below builds the separation and layoff flows from the SOEP raw files)

////////////////////////////////////////////////////////////////////////////////

cd "$germanydata"

* Person weights (phrf) and the pbleib factors, one variable per wave in the raw file
use "raw/phrf.dta", clear
keep pid *phrf *pbleib

* Single-letter wave prefixes a..z correspond to 1984..2009.
* capture: a wave without a pbleib variable is skipped instead of stopping the run
forvalues k = 1/26 {
	local wave : word `k' of `c(alpha)'
	local yr = 1983 + `k'
	rename `wave'phrf phrf`yr'
	capture rename `wave'pbleib bleib`yr'
}

* Two-letter wave prefixes ba..bi correspond to 2010..2018
forvalues k = 1/9 {
	local wave : word `k' of `c(alpha)'
	local yr = 2009 + `k'
	rename b`wave'phrf phrf`yr'
	rename b`wave'pbleib bleib`yr'
}

* Long format: one row per pid and year
reshape long phrf bleib, i(pid) j(year)

* lwght = this year's bleib times the previous year's phrf for the same person
xtset pid year
generate lwght = bleib * L.phrf
rename phrf xw
keep pid year lwght xw

tempfile weights
save `weights'

* pgen-based employment indicator per person-year:
* 1 for pgemplst codes 1-4, 0 for codes 5-6, missing otherwise
use "raw/pgen.dta", clear
rename syear year
generate employed2 = cond(inrange(pgemplst, 1, 4), 1, cond(inrange(pgemplst, 5, 6), 0, .))
keep pid year employed2

tempfile employed2
save `employed2'

* Yearly weighted employment, separations and layoffs, and the resulting rates
use "raw/pl.dta", clear
	rename syear year

*Employment variable
* employed is taken entirely from the pgen-based employed2. The earlier
* derivation from plb0037_h was overwritten unconditionally and never used,
* so it has been removed; results are unchanged.
merge 1:1 pid year using `employed2'
	keep if _merge==3
	drop _merge
gen employed=employed2
	
xtset pid year

*Separation variables
* separation: plb0282_h code 1; layoff: a separation with plb0304_h code 1, 3 or 5
gen separation = (plb0282_h==1)
gen layoff = (separation==1 & inlist(plb0304_h,1,3,5))

keep pid year employed separation layoff

merge 1:1 pid year using `weights'
	keep if _merge==3
	drop _merge

* Weight each indicator, then add up by year. Employment uses the current
* weight xw; separations and layoffs use lwght.
replace employed = xw * employed
replace separation = lwght * separation
replace layoff = lwght * layoff
collapse (sum) employed separation layoff, by(year)

* Rates: next year's separations/layoffs relative to this year's employment
tsset year
gen sep_total = F.separation/employed
gen layoff_total = F.layoff/employed

save "clean/germany_flows.dta", replace

* Empty the temp folder. The working directory is changed to it first, so the
* bare file names returned by the dir macro function resolve correctly.
cd "$germanydata/temp"
local files: dir "`c(pwd)'" files "*"

foreach file of local files {
* Quote the name so files containing spaces are erased too; capture keeps the
* loop going if a single erase fails
	cap erase "`file'"
}
